In [4]:
import pandas as pd
pd.read_csv("C:\\Users\\LENOVO\\Desktop\\Summer-Olympic-medals-1976-to-2008.csv",encoding='cp1252')
Out[4]:
City Year Sport Discipline Event Athlete Gender Country_Code Country Event_gender Medal
0 Montreal 1976.0 Aquatics Diving 3m springboard KÖHLER, Christa Women GDR East Germany W Silver
1 Montreal 1976.0 Aquatics Diving 3m springboard KOSENKOV, Aleksandr Men URS Soviet Union M Bronze
2 Montreal 1976.0 Aquatics Diving 3m springboard BOGGS, Philip George Men USA United States M Gold
3 Montreal 1976.0 Aquatics Diving 3m springboard CAGNOTTO, Giorgio Franco Men ITA Italy M Silver
4 Montreal 1976.0 Aquatics Diving 10m platform WILSON, Deborah Keplar Women USA United States W Bronze
... ... ... ... ... ... ... ... ... ... ... ...
15428 Beijing 2008.0 Wrestling Wrestling Gre-R 66 - 74kg GUENOT, Christophe Men FRA France M Bronze
15429 Beijing 2008.0 Wrestling Wrestling Gre-R 66 - 74kg KVIRKELIA, Manuchar Men GEO Georgia M Gold
15430 Beijing 2008.0 Wrestling Wrestling Gre-R 55 - 60kg RAHIMOV, Vitaliy Men AZE Azerbaijan M Silver
15431 Beijing 2008.0 Wrestling Wrestling Gre-R 60 - 66kg GUENOT, Steeve Men FRA France M Gold
15432 Beijing 2008.0 Wrestling Wrestling Gre-R 96 - 120kg LOPEZ, Mijain Men CUB Cuba M Gold

15433 rows × 11 columns

In [6]:
#Data Cleaning and Preparation:
#Cleanse the dataset by handling missing values, incorrect entries, and data inconsistencies.
#Transform data into appropriate formats for analysis, such as converting years and medal types.
df = pd.read_csv("C:\\Users\\LENOVO\\Desktop\\Summer-Olympic-medals-1976-to-2008.csv", encoding='latin1')

# Check for missing values
print(df.isnull().sum())

# Handle missing values
df['City'].fillna('Unknown', inplace=True)
df['Year'].fillna(df['Year'].mode()[0], inplace=True)
df['Sport'].fillna('Unknown', inplace=True)
df['Discipline'].fillna('Unknown', inplace=True)
df['Event'].fillna('Unknown', inplace=True)
df['Athlete'].fillna('Unknown', inplace=True)
df['Gender'].fillna('Unknown', inplace=True)
df['Country_Code'].fillna('Unknown', inplace=True)
df['Country'].fillna('Unknown', inplace=True)
df['Event_gender'].fillna('Unknown', inplace=True)
df['Medal'].fillna('None', inplace=True)

# Convert 'Year' to numeric and handle any errors
df['Year'] = pd.to_numeric(df['Year'], errors='coerce')

# Capitalize string columns to ensure consistency
df['Sport'] = df['Sport'].str.capitalize()
df['Discipline'] = df['Discipline'].str.capitalize()
df['Event'] = df['Event'].str.capitalize()
df['Athlete'] = df['Athlete'].str.title()
df['Gender'] = df['Gender'].str.capitalize()
df['Country'] = df['Country'].str.capitalize()
df['Event_gender'] = df['Event_gender'].str.capitalize()
df['Medal'] = df['Medal'].str.capitalize()

# Check for unique values in categorical columns to ensure consistency
print(df['Gender'].unique())
print(df['Event_gender'].unique())
print(df['Medal'].unique())

# Standardize 'Gender' and 'Event_gender' entries
df['Gender'] = df['Gender'].replace({'M': 'Men', 'W': 'Women'})
df['Event_gender'] = df['Event_gender'].replace({'M': 'Men', 'W': 'Women'})

# Convert 'Year' to integer
df['Year'] = df['Year'].astype(int)

# Ensure 'Medal' column has consistent capitalization
df['Medal'] = df['Medal'].str.capitalize()

# Create a mapping for medal types if needed
medal_mapping = {'Gold': 3, 'Silver': 2, 'Bronze': 1, 'None': 0}
df['Medal_Value'] = df['Medal'].map(medal_mapping)

print(df.head())
City            117
Year            117
Sport           117
Discipline      117
Event           117
Athlete         117
Gender          117
Country_Code    117
Country         117
Event_gender    117
Medal           117
dtype: int64
['Women' 'Men' 'Unknown']
['W' 'M' 'X' 'Unknown']
['Silver' 'Bronze' 'Gold' 'None']
       City  Year     Sport Discipline           Event  \
0  Montreal  1976  Aquatics     Diving  3m springboard   
1  Montreal  1976  Aquatics     Diving  3m springboard   
2  Montreal  1976  Aquatics     Diving  3m springboard   
3  Montreal  1976  Aquatics     Diving  3m springboard   
4  Montreal  1976  Aquatics     Diving    10m platform   

                    Athlete Gender Country_Code        Country Event_gender  \
0           Köhler, Christa  Women          GDR   East germany        Women   
1       Kosenkov, Aleksandr    Men          URS   Soviet union          Men   
2      Boggs, Philip George    Men          USA  United states          Men   
3  Cagnotto, Giorgio Franco    Men          ITA          Italy          Men   
4    Wilson, Deborah Keplar  Women          USA  United states        Women   

    Medal  Medal_Value  
0  Silver            2  
1  Bronze            1  
2    Gold            3  
3  Silver            2  
4  Bronze            1  
In [7]:
#Exploratory Data Analysis (EDA):
#Conduct initial explorations to understand the distribution of medals, number of participating nations, and other key metrics.
#Visualize the overall trends in data, such as total medals won over the years.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Distribution of Medals
medal_counts = df['Medal'].value_counts()
print(medal_counts)

sns.countplot(data=df, x='Medal', order=medal_counts.index)
plt.title('Distribution of Medals')
plt.xlabel('Medal')
plt.ylabel('Count')
plt.show()

# Number of Participating Nations
nations = df['Country'].nunique()
print(f"Number of participating nations: {nations}")

nation_medal_counts = df['Country'].value_counts().head(20)
sns.barplot(y=nation_medal_counts.index, x=nation_medal_counts.values)
plt.title('Number of Medals Won by Country (Top 20)')
plt.xlabel('Number of Medals')
plt.ylabel('Country')
plt.show()

# Total Medals Won Over the Years
medals_by_year = df.groupby('Year')['Medal'].count()
print(medals_by_year)

plt.figure(figsize=(10, 6))
sns.lineplot(data=medals_by_year)
plt.title('Total Medals Won Over the Years')
plt.xlabel('Year')
plt.ylabel('Number of Medals')
plt.show()

# Medals Won by Gender Over the Years
medals_by_gender_year = df.groupby(['Year', 'Gender'])['Medal'].count().unstack()
print(medals_by_gender_year)

medals_by_gender_year.plot(kind='bar', stacked=True, figsize=(12, 7))
plt.title('Medals Won by Gender Over the Years')
plt.xlabel('Year')
plt.ylabel('Number of Medals')
plt.legend(title='Gender')
plt.show()

# Top Athletes by Medals Won
top_athletes = df['Athlete'].value_counts().head(10)
print(top_athletes)

sns.barplot(y=top_athletes.index, x=top_athletes.values)
plt.title('Top Athletes by Medals Won')
plt.xlabel('Number of Medals')
plt.ylabel('Athlete')
plt.show()
Medal
Bronze    5258
Gold      5042
Silver    5016
Name: count, dtype: int64
No description has been provided for this image
Number of participating nations: 127
No description has been provided for this image
Year
1976.0    1305
1980.0    1387
1984.0    1459
1988.0    1546
1992.0    1705
1996.0    1859
2000.0    2015
2004.0    1998
2008.0    2042
Name: Medal, dtype: int64
C:\Users\LENOVO\anaconda\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
C:\Users\LENOVO\anaconda\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
No description has been provided for this image
Gender   Men  Women
Year               
1976.0   924    381
1980.0   958    429
1984.0   973    486
1988.0  1011    535
1992.0  1105    600
1996.0  1082    777
2000.0  1126    889
2004.0  1099    899
2008.0  1110    932
No description has been provided for this image
Athlete
PHELPS, Michael           16
TORRES, Dara              12
THOMPSON, Jenny           12
FISCHER, Birgit           12
ANDRIANOV, Nikolay        12
NEMOV, Alexei             12
COUGHLIN, Natalie         11
BIONDI, Matthew           11
VAN ALMSICK, Franziska    10
LEWIS, Carl               10
Name: count, dtype: int64
No description has been provided for this image
In [8]:
#Country-Level Performance Analysis:
#Analyze the performance of countries over the years in terms of medals won.
#Identify the most successful countries in the Olympics and explore factors contributing to their success.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the data
df = pd.read_csv("C:\\Users\\LENOVO\\Desktop\\Summer-Olympic-medals-1976-to-2008.csv", encoding='latin1')

# Calculate the total medals won by each country
total_medals_by_country = df['Country'].value_counts()
print(total_medals_by_country.head(10))

# Plot the total medals won by each country (Top 20)
plt.figure(figsize=(12, 8))
sns.barplot(y=total_medals_by_country.index[:20], x=total_medals_by_country.values[:20])
plt.title('Total Medals Won by Country (Top 20)')
plt.xlabel('Number of Medals')
plt.ylabel('Country')
plt.show()

# Identify the most successful countries
top_countries = total_medals_by_country.head(10).index
print(f"Top 10 Countries: {top_countries}")

# Filter data for the top countries
df_top_countries = df[df['Country'].isin(top_countries)]

# Group by Year and Country to get the total medals per year for each country
medals_by_country_year = df_top_countries.groupby(['Year', 'Country']).size().unstack()

# Plot the performance of top countries over time
plt.figure(figsize=(14, 8))
medals_by_country_year.plot(kind='line', marker='o', figsize=(14, 8))
plt.title('Performance of Top 10 Countries Over Time')
plt.xlabel('Year')
plt.ylabel('Number of Medals')
plt.legend(title='Country')
plt.show()
Country
United States     1992
Soviet Union      1021
Australia          798
Germany            691
China              679
Russia             638
East Germany       626
Italy              486
Romania            482
United Kingdom     467
Name: count, dtype: int64
No description has been provided for this image
Top 10 Countries: Index(['United States', 'Soviet Union', 'Australia', 'Germany', 'China',
       'Russia', 'East Germany', 'Italy', 'Romania', 'United Kingdom'],
      dtype='object', name='Country')
<Figure size 1400x800 with 0 Axes>
No description has been provided for this image
In [9]:
#Athlete Performance Metrics:
#Evaluate the achievements of athletes, identifying standout performers and their medal hauls.
#Analyze the performance based on athlete demographics like gender and age (if available).
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the data
df = pd.read_csv("C:\\Users\\LENOVO\\Desktop\\Summer-Olympic-medals-1976-to-2008.csv", encoding='latin1')

# Calculate the total medals won by each athlete
total_medals_by_athlete = df['Athlete'].value_counts()
print(total_medals_by_athlete.head(10))

# Plot the total medals won by each athlete (Top 10)
plt.figure(figsize=(12, 8))
sns.barplot(y=total_medals_by_athlete.index[:10], x=total_medals_by_athlete.values[:10])
plt.title('Total Medals Won by Athlete (Top 10)')
plt.xlabel('Number of Medals')
plt.ylabel('Athlete')
plt.show()

# Identify top 10 standout performers
top_athletes = total_medals_by_athlete.head(10).index
print(f"Top 10 Athletes: {top_athletes}")

# Filter data for the top athletes
df_top_athletes = df[df['Athlete'].isin(top_athletes)]
print(df_top_athletes)

# Calculate the total medals won by gender
medals_by_gender = df['Gender'].value_counts()
print(medals_by_gender)

# Plot the total medals won by gender
plt.figure(figsize=(8, 6))
sns.countplot(data=df, x='Gender', order=medals_by_gender.index)
plt.title('Total Medals Won by Gender')
plt.xlabel('Gender')
plt.ylabel('Number of Medals')
plt.show()

# Calculate the total medals won by gender over the years
medals_by_gender_year = df.groupby(['Year', 'Gender'])['Medal'].count().unstack()
print(medals_by_gender_year)

# Plot the total medals won by gender over the years
medals_by_gender_year.plot(kind='bar', stacked=True, figsize=(14, 8))
plt.title('Total Medals Won by Gender Over the Years')
plt.xlabel('Year')
plt.ylabel('Number of Medals')
plt.legend(title='Gender')
plt.show()

# If Age data is available, analyze performance by age
# Example assuming Age column is available
# Calculate the total medals won by age
# medals_by_age = df['Age'].value_counts().sort_index()
# print(medals_by_age)

# Plot the total medals won by age
# plt.figure(figsize=(12, 8))
# sns.barplot(x=medals_by_age.index, y=medals_by_age.values)
# plt.title('Total Medals Won by Age')
# plt.xlabel('Age')
# plt.ylabel('Number of Medals')
# plt.show()
Athlete
PHELPS, Michael           16
TORRES, Dara              12
THOMPSON, Jenny           12
FISCHER, Birgit           12
ANDRIANOV, Nikolay        12
NEMOV, Alexei             12
COUGHLIN, Natalie         11
BIONDI, Matthew           11
VAN ALMSICK, Franziska    10
LEWIS, Carl               10
Name: count, dtype: int64
No description has been provided for this image
Top 10 Athletes: Index(['PHELPS, Michael', 'TORRES, Dara', 'THOMPSON, Jenny', 'FISCHER, Birgit',
       'ANDRIANOV, Nikolay', 'NEMOV, Alexei', 'COUGHLIN, Natalie',
       'BIONDI, Matthew', 'VAN ALMSICK, Franziska', 'LEWIS, Carl'],
      dtype='object', name='Athlete')
           City    Year       Sport   Discipline                   Event  \
689    Montreal  1976.0  Gymnastics  Artistic G.           parallel bars   
691    Montreal  1976.0  Gymnastics  Artistic G.                   vault   
729    Montreal  1976.0  Gymnastics  Artistic G.        team competition   
732    Montreal  1976.0  Gymnastics  Artistic G.         floor exercises   
738    Montreal  1976.0  Gymnastics  Artistic G.                   rings   
...         ...     ...         ...          ...                     ...   
13567   Beijing  2008.0    Aquatics     Swimming  4x100m freestyle relay   
13568   Beijing  2008.0    Aquatics     Swimming  4x200m freestyle relay   
13589   Beijing  2008.0    Aquatics     Swimming  200m individual medley   
13592   Beijing  2008.0    Aquatics     Swimming          100m freestyle   
13597   Beijing  2008.0    Aquatics     Swimming  200m individual medley   

                  Athlete Gender Country_Code        Country Event_gender  \
689    ANDRIANOV, Nikolay    Men          URS   Soviet Union            M   
691    ANDRIANOV, Nikolay    Men          URS   Soviet Union            M   
729    ANDRIANOV, Nikolay    Men          URS   Soviet Union            M   
732    ANDRIANOV, Nikolay    Men          URS   Soviet Union            M   
738    ANDRIANOV, Nikolay    Men          URS   Soviet Union            M   
...                   ...    ...          ...            ...          ...   
13567     PHELPS, Michael    Men          USA  United States            M   
13568     PHELPS, Michael    Men          USA  United States            M   
13589   COUGHLIN, Natalie  Women          USA  United States            W   
13592   COUGHLIN, Natalie  Women          USA  United States            W   
13597     PHELPS, Michael    Men          USA  United States            M   

        Medal  
689    Silver  
691      Gold  
729    Silver  
732      Gold  
738      Gold  
...       ...  
13567    Gold  
13568    Gold  
13589  Bronze  
13592  Bronze  
13597    Gold  

[118 rows x 11 columns]
Gender
Men      9388
Women    5928
Name: count, dtype: int64
No description has been provided for this image
Gender   Men  Women
Year               
1976.0   924    381
1980.0   958    429
1984.0   973    486
1988.0  1011    535
1992.0  1105    600
1996.0  1082    777
2000.0  1126    889
2004.0  1099    899
2008.0  1110    932
No description has been provided for this image
In [10]:
#Sport and Event-Specific Trends:
#Break down the analysis by individual sports and events to identify trends and dominant nations in each.
#Analyze changes in sports disciplines over the years, including the introduction or removal of sports from the Olympics.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the data
df = pd.read_csv("C:\\Users\\LENOVO\\Desktop\\Summer-Olympic-medals-1976-to-2008.csv", encoding='latin1')

# Identify the most popular sports and events
medals_by_sport = df['Sport'].value_counts()
print(medals_by_sport.head(10))

plt.figure(figsize=(12, 8))
sns.barplot(y=medals_by_sport.index[:10], x=medals_by_sport.values[:10])
plt.title('Number of Medals Awarded in Each Sport (Top 10)')
plt.xlabel('Number of Medals')
plt.ylabel('Sport')
plt.show()

medals_by_event = df['Event'].value_counts()
print(medals_by_event.head(10))

plt.figure(figsize=(12, 8))
sns.barplot(y=medals_by_event.index[:10], x=medals_by_event.values[:10])
plt.title('Number of Medals Awarded in Each Event (Top 10)')
plt.xlabel('Number of Medals')
plt.ylabel('Event')
plt.show()

# Determine dominant nations in each sport
dominant_nations_by_sport = df.groupby(['Sport', 'Country']).size().reset_index(name='Medal Count')
dominant_nations_by_sport = dominant_nations_by_sport.sort_values(['Sport', 'Medal Count'], ascending=[True, False])
dominant_nations_by_sport = dominant_nations_by_sport.groupby('Sport').head(1)
print(dominant_nations_by_sport)

top_sports = medals_by_sport.index[:10]
dominant_nations_top_sports = dominant_nations_by_sport[dominant_nations_by_sport['Sport'].isin(top_sports)]

plt.figure(figsize=(12, 8))
sns.barplot(y='Sport', x='Medal Count', hue='Country', data=dominant_nations_top_sports)
plt.title('Dominant Nations in Each Sport (Top 10 Sports)')
plt.xlabel('Medal Count')
plt.ylabel('Sport')
plt.legend(title='Country')
plt.show()

# Analyze changes in sports disciplines over the years
medals_by_sport_year = df.groupby(['Year', 'Sport']).size().unstack().fillna(0)
print(medals_by_sport_year)

plt.figure(figsize=(14, 8))
medals_by_sport_year[top_sports].plot(kind='line', marker='o')
plt.title('Changes in the Number of Medals Awarded in Top Sports Over the Years')
plt.xlabel('Year')
plt.ylabel('Number of Medals')
plt.legend(title='Sport')
plt.show()
Sport
Aquatics         2210
Athletics        1523
Rowing           1377
Hockey            817
Gymnastics        783
Handball          780
Volleyball        695
Canoe / Kayak     672
Football          669
Basketball        646
Name: count, dtype: int64
No description has been provided for this image
Event
hockey                      817
handball                    780
football                    669
volleyball                  647
basketball                  646
eight with coxswain (8+)    486
team                        459
water polo                  455
baseball                    335
team competition            329
Name: count, dtype: int64
No description has been provided for this image
                 Sport        Country  Medal Count
43            Aquatics  United States          578
56             Archery   Korea, South           52
142          Athletics  United States          299
145          Badminton          China           47
153           Baseball           Cuba          111
172         Basketball  United States          192
186             Boxing           Cuba           56
251      Canoe / Kayak        Germany           96
277            Cycling      Australia           54
333         Equestrian  United States           73
338            Fencing         France          115
356           Football         Brazil          109
393         Gymnastics        Romania          116
412           Handball   Korea, South           96
431             Hockey    Netherlands          159
464               Judo          Japan           57
495  Modern Pentathlon        Hungary           15
531             Rowing        Romania          156
579            Sailing  United States           64
590           Shooting          China           42
636           Softball      Australia           60
640       Table Tennis          China           61
667          Taekwondo   Korea, South           12
702             Tennis  United States           24
704          Triathlon      Australia            4
715         Volleyball         Brazil          102
741      Weightlifting          China           43
824          Wrestling  United States           57
No description has been provided for this image
Sport   Aquatics  Archery  Athletics  Badminton  Baseball  Basketball  Boxing  \
Year                                                                            
1976.0     159.0      6.0      147.0        0.0       0.0        72.0    44.0   
1980.0     159.0      6.0      150.0        0.0       0.0        72.0    44.0   
1984.0     192.0      6.0      161.0        0.0       0.0        72.0    48.0   
1988.0     202.0     24.0      163.0        0.0       0.0        72.0    48.0   
1992.0     228.0     24.0      178.0       24.0      60.0        72.0    48.0   
1996.0     262.0     24.0      180.0       24.0      60.0        72.0    48.0   
2000.0     329.0     24.0      184.0       24.0      72.0        72.0    48.0   
2004.0     332.0     24.0      183.0       24.0      71.0        70.0    44.0   
2008.0     347.0     24.0      177.0       24.0      72.0        72.0    44.0   

Sport   Canoe / Kayak  Cycling  Equestrian  ...  Sailing  Shooting  Softball  \
Year                                        ...                                
1976.0           57.0     36.0        42.0  ...     36.0      21.0       0.0   
1980.0           57.0     36.0        62.0  ...     36.0      21.0       0.0   
1984.0           69.0     42.0        47.0  ...     39.0      33.0       0.0   
1988.0           69.0     45.0        45.0  ...     45.0      39.0       0.0   
1992.0           84.0     50.0        45.0  ...     51.0      39.0       0.0   
1996.0           84.0     52.0        45.0  ...     48.0      45.0      45.0   
2000.0           84.0     75.0        45.0  ...     54.0      51.0      45.0   
2004.0           84.0     76.0        47.0  ...     54.0      51.0      45.0   
2008.0           84.0     71.0        45.0  ...     54.0      45.0      45.0   

Sport   Table Tennis  Taekwondo  Tennis  Triathlon  Volleyball  Weightlifting  \
Year                                                                            
1976.0           0.0        0.0     0.0        0.0        72.0           27.0   
1980.0           0.0        0.0     0.0        0.0        72.0           30.0   
1984.0           0.0        0.0     0.0        0.0        72.0           30.0   
1988.0          18.0        0.0    24.0        0.0        72.0           30.0   
1992.0          24.0        0.0    24.0        0.0        72.0           29.0   
1996.0          18.0        0.0    18.0        0.0        84.0           30.0   
2000.0          18.0       24.0    18.0        6.0        84.0           45.0   
2004.0          18.0       24.0    16.0        6.0        83.0           45.0   
2008.0          24.0       32.0    18.0        6.0        84.0           45.0   

Sport   Wrestling  
Year               
1976.0       60.0  
1980.0       60.0  
1984.0       60.0  
1988.0       60.0  
1992.0       60.0  
1996.0       60.0  
2000.0       48.0  
2004.0       54.0  
2008.0       71.0  

[9 rows x 28 columns]
<Figure size 1400x800 with 0 Axes>
No description has been provided for this image
In [11]:
#Impact of Geopolitical Events:
#Examine the impact of major geopolitical events, like wars or boycotts, on the participation and success rates in the Olympics.
#Study the return or debut of nations in the Olympic games and their performances.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the data
df = pd.read_csv("C:\\Users\\LENOVO\\Desktop\\Summer-Olympic-medals-1976-to-2008.csv", encoding='latin1')

# Define the years of major geopolitical events
geopolitical_years = [1980, 1984]

# Analyze participation rates around these events
participation_by_year = df.groupby('Year')['Country'].nunique()
print(participation_by_year)

plt.figure(figsize=(12, 8))
participation_by_year.plot(kind='bar')
plt.axvline(x=participation_by_year.index.get_loc(1980), color='red', linestyle='--', label='1980 Moscow Boycott')
plt.axvline(x=participation_by_year.index.get_loc(1984), color='blue', linestyle='--', label='1984 LA Boycott')
plt.title('Number of Participating Countries in the Olympics (1976-2008)')
plt.xlabel('Year')
plt.ylabel('Number of Participating Countries')
plt.legend()
plt.show()

# Analyze success rates (medals won) around these events
medals_by_country_year = df.groupby(['Year', 'Country']).size().unstack().fillna(0)
print(medals_by_country_year)

top_countries = medals_by_country_year.sum().sort_values(ascending=False).head(5).index

plt.figure(figsize=(14, 8))
medals_by_country_year[top_countries].plot(kind='line', marker='o')
plt.axvline(x=participation_by_year.index.get_loc(1980), color='red', linestyle='--', label='1980 Moscow Boycott')
plt.axvline(x=participation_by_year.index.get_loc(1984), color='blue', linestyle='--', label='1984 LA Boycott')
plt.title('Total Medals Won by Top 5 Countries (1976-2008)')
plt.xlabel('Year')
plt.ylabel('Number of Medals')
plt.legend(title='Country')
plt.show()

# Study the return or debut of nations
debut_or_return = df.groupby('Country')['Year'].min().reset_index()
print(debut_or_return)

debut_1980 = df[df['Year'] == 1980]['Country'].unique()
debut_1984 = df[df['Year'] == 1984]['Country'].unique()

medals_by_debut_1980 = df[(df['Country'].isin(debut_1980)) & (df['Year'] == 1980)]['Country'].value_counts()
medals_by_debut_1984 = df[(df['Country'].isin(debut_1984)) & (df['Year'] == 1984)]['Country'].value_counts()

plt.figure(figsize=(12, 8))
sns.barplot(y=medals_by_debut_1980.index, x=medals_by_debut_1980.values)
plt.title('Medals Won by Debuting Nations in 1980')
plt.xlabel('Number of Medals')
plt.ylabel('Country')
plt.show()

plt.figure(figsize=(12, 8))
sns.barplot(y=medals_by_debut_1984.index, x=medals_by_debut_1984.values)
plt.title('Medals Won by Debuting Nations in 1984')
plt.xlabel('Number of Medals')
plt.ylabel('Country')
plt.show()
Year
1976.0    41
1980.0    36
1984.0    47
1988.0    52
1992.0    64
1996.0    79
2000.0    80
2004.0    74
2008.0    86
Name: Country, dtype: int64
No description has been provided for this image
Country  Afghanistan  Algeria  Argentina  Armenia  Australia  Austria  \
Year                                                                    
1976.0           0.0      0.0        0.0      0.0       24.0      1.0   
1980.0           0.0      0.0        0.0      0.0       12.0      5.0   
1984.0           0.0      2.0        0.0      0.0       50.0      3.0   
1988.0           0.0      0.0       13.0      0.0       34.0      1.0   
1992.0           0.0      2.0        2.0      0.0       57.0      6.0   
1996.0           0.0      3.0       20.0      2.0      132.0      3.0   
2000.0           0.0      5.0       20.0      1.0      183.0      4.0   
2004.0           0.0      0.0       47.0      0.0      157.0      8.0   
2008.0           1.0      2.0       51.0      6.0      149.0      3.0   

Country  Azerbaijan  Bahamas  Barbados  Belarus  ...  United States  Uruguay  \
Year                                             ...                           
1976.0          0.0      0.0       0.0      0.0  ...          155.0      0.0   
1980.0          0.0      0.0       0.0      0.0  ...            0.0      0.0   
1984.0          0.0      0.0       0.0      0.0  ...          333.0      0.0   
1988.0          0.0      0.0       0.0      0.0  ...          193.0      0.0   
1992.0          0.0      1.0       0.0      0.0  ...          224.0      0.0   
1996.0          1.0      5.0       0.0     23.0  ...          260.0      0.0   
2000.0          3.0      6.0       1.0     22.0  ...          248.0      1.0   
2004.0          5.0      2.0       0.0     17.0  ...          264.0      0.0   
2008.0          7.0      5.0       0.0     30.0  ...          315.0      0.0   

Country  Uzbekistan  Venezuela  Vietnam  Virgin Islands*  West Germany  \
Year                                                                     
1976.0          0.0        1.0      0.0              0.0          75.0   
1980.0          0.0        1.0      0.0              0.0           0.0   
1984.0          0.0        3.0      0.0              0.0         157.0   
1988.0          0.0        0.0      0.0              1.0         113.0   
1992.0          0.0        0.0      0.0              0.0           0.0   
1996.0          2.0        0.0      0.0              0.0           0.0   
2000.0          4.0        0.0      1.0              0.0           0.0   
2004.0          5.0        2.0      0.0              0.0           0.0   
2008.0          6.0        1.0      1.0              0.0           0.0   

Country  Yugoslavia  Zambia  Zimbabwe  
Year                                   
1976.0         19.0     0.0       0.0  
1980.0         57.0     0.0      16.0  
1984.0         87.0     1.0       0.0  
1988.0         63.0     0.0       0.0  
1992.0          0.0     0.0       0.0  
1996.0         26.0     1.0       0.0  
2000.0         26.0     0.0       0.0  
2004.0          0.0     0.0       3.0  
2008.0          0.0     0.0       4.0  

[9 rows x 127 columns]
<Figure size 1400x800 with 0 Axes>
No description has been provided for this image
             Country    Year
0        Afghanistan  2008.0
1            Algeria  1984.0
2          Argentina  1988.0
3            Armenia  1996.0
4          Australia  1976.0
..               ...     ...
122  Virgin Islands*  1988.0
123     West Germany  1976.0
124       Yugoslavia  1976.0
125           Zambia  1984.0
126         Zimbabwe  1980.0

[127 rows x 2 columns]
No description has been provided for this image
No description has been provided for this image
In [27]:
#Predictive Analysis for Future Olympics:
#Use historical data to predict future medal counts and performances for upcoming Olympics.
#Develop models to forecast which countries might rise as new leaders in certain sports.
import pandas as pd
import numpy as np
# Convert the 'Year' column to an integer
#df['Year'] = df['Year'].astype(int)

# Check for missing values
df.isnull().sum()

# Fill missing values or drop rows/columns as appropriate
# In this case, we assume the dataset is clean and complete
Out[27]:
City            117
Year            117
Sport           117
Discipline      117
Event           117
Athlete         117
Gender          117
Country_Code    117
Country         117
Event_gender    117
Medal           117
dtype: int64
In [13]:
import matplotlib.pyplot as plt
import seaborn as sns

# Total medals won by country over the years
medals_by_country = df.groupby('Country').size().sort_values(ascending=False)
print(medals_by_country.head(10))

plt.figure(figsize=(12, 8))
sns.barplot(y=medals_by_country.index[:10], x=medals_by_country.values[:10])
plt.title('Total Medals Won by Country (Top 10)')
plt.xlabel('Number of Medals')
plt.ylabel('Country')
plt.show()
Country
United States     1992
Soviet Union      1021
Australia          798
Germany            691
China              679
Russia             638
East Germany       626
Italy              486
Romania            482
United Kingdom     467
dtype: int64
No description has been provided for this image
In [31]:
# Import necessary libraries
import pandas as pd
import statsmodels.api as sm
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Load and preprocess the data
df = pd.read_csv("C:\\Users\\LENOVO\\Desktop\\Summer-Olympic-medals-1976-to-2008.csv", encoding='latin1')
#df['Year'] = df['Year'].astype(int)
medals_by_country_sport_year = df.groupby(['Year', 'Country', 'Sport']).size().reset_index(name='Medal Count')

# Time Series Analysis
sport_data = medals_by_country_sport_year[medals_by_country_sport_year['Sport'] == 'Athletics']
pivot_table = sport_data.pivot(index='Year', columns='Country', values='Medal Count').fillna(0)

# Ensure the country data has enough points for ARIMA
if 'USA' in pivot_table.columns:
    country_data = pivot_table['USA']
    if len(country_data) > 0:
        plt.figure(figsize=(12, 6))
        plt.plot(country_data.index, country_data.values)
        plt.xlabel('Year')
        plt.ylabel('Medal Count')
        plt.title('Medal Count for USA in Athletics Over the Years')
        plt.show()

        try:
            model = sm.tsa.ARIMA(country_data, order=(1, 1, 1))
            results = model.fit()
            forecast = results.forecast(steps=4)  # Predict for the next 4 Olympic games (16 years)
            print("Forecasted Medal Count for USA in Athletics for the next 4 Olympics:", forecast)
        except Exception as e:
            print("ARIMA model fitting/forecasting error:", e)
    else:
        print("Not enough data for ARIMA model.")
else:
    print("USA data not available in the dataset.")

# Machine Learning Approach
features = medals_by_country_sport_year[['Year', 'Country', 'Sport']]
features = pd.get_dummies(features)
target = medals_by_country_sport_year['Medal Count']

# Ensure there are enough data points
if not features.empty:
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

    # Check for sufficient training data
    if len(X_train) > 0 and len(y_train) > 0:
        # Initialize and train the Random Forest model
        model = RandomForestRegressor(n_estimators=100, random_state=42)
        model.fit(X_train, y_train)

        # Make predictions
        predictions = model.predict(X_test)

        # Evaluate the model
        mse = mean_squared_error(y_test, predictions)
        print(f"Mean Squared Error: {mse}")

        # Feature importance
        importances = model.feature_importances_
        feature_names = features.columns
        sorted_importances = sorted(zip(importances, feature_names), reverse=True)

        print("Top 10 Feature Importances:")
        for importance, name in sorted_importances[:10]:
            print(f"{name}: {importance:.4f}")
    else:
        print("Not enough data for training the Random Forest model.")
else:
    print("Not enough data for Random Forest model.")
USA data not available in the dataset.
Mean Squared Error: 15.310240155945417
Top 10 Feature Importances:
Sport_Aquatics: 0.1760
Year: 0.1156
Country_United States: 0.0996
Sport_Athletics: 0.0589
Country_Soviet Union: 0.0560
Sport_Hockey: 0.0459
Sport_Handball: 0.0371
Sport_Rowing: 0.0363
Country_East Germany: 0.0353
Sport_Football: 0.0316
In [ ]:
#Data Visualization and Reporting:
#Create dynamic visualizations and dashboards that allow stakeholders to interact with the data and extract personalized insights.
#Prepare a detailed report summarizing the findings, methodologies, and implications for Olympic committees and sports analysts.
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [21]:
pip install plotly dash pandas
Requirement already satisfied: plotly in c:\users\lenovo\anaconda\lib\site-packages (5.9.0)
Collecting dash
  Downloading dash-2.17.1-py3-none-any.whl.metadata (10 kB)
Requirement already satisfied: pandas in c:\users\lenovo\anaconda\lib\site-packages (2.1.4)
Requirement already satisfied: tenacity>=6.2.0 in c:\users\lenovo\anaconda\lib\site-packages (from plotly) (8.2.2)
Requirement already satisfied: Flask<3.1,>=1.0.4 in c:\users\lenovo\anaconda\lib\site-packages (from dash) (2.2.5)
Requirement already satisfied: Werkzeug<3.1 in c:\users\lenovo\anaconda\lib\site-packages (from dash) (2.2.3)
Collecting dash-html-components==2.0.0 (from dash)
  Downloading dash_html_components-2.0.0-py3-none-any.whl.metadata (3.8 kB)
Collecting dash-core-components==2.0.0 (from dash)
  Downloading dash_core_components-2.0.0-py3-none-any.whl.metadata (2.9 kB)
Collecting dash-table==5.0.0 (from dash)
  Downloading dash_table-5.0.0-py3-none-any.whl.metadata (2.4 kB)
Requirement already satisfied: importlib-metadata in c:\users\lenovo\anaconda\lib\site-packages (from dash) (7.0.1)
Requirement already satisfied: typing-extensions>=4.1.1 in c:\users\lenovo\anaconda\lib\site-packages (from dash) (4.9.0)
Requirement already satisfied: requests in c:\users\lenovo\anaconda\lib\site-packages (from dash) (2.31.0)
Collecting retrying (from dash)
  Downloading retrying-1.3.4-py3-none-any.whl.metadata (6.9 kB)
Requirement already satisfied: nest-asyncio in c:\users\lenovo\anaconda\lib\site-packages (from dash) (1.6.0)
Requirement already satisfied: setuptools in c:\users\lenovo\anaconda\lib\site-packages (from dash) (68.2.2)
Requirement already satisfied: numpy<2,>=1.23.2 in c:\users\lenovo\anaconda\lib\site-packages (from pandas) (1.26.4)
Requirement already satisfied: python-dateutil>=2.8.2 in c:\users\lenovo\anaconda\lib\site-packages (from pandas) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in c:\users\lenovo\anaconda\lib\site-packages (from pandas) (2023.3.post1)
Requirement already satisfied: tzdata>=2022.1 in c:\users\lenovo\anaconda\lib\site-packages (from pandas) (2023.3)
Requirement already satisfied: Jinja2>=3.0 in c:\users\lenovo\anaconda\lib\site-packages (from Flask<3.1,>=1.0.4->dash) (3.1.3)
Requirement already satisfied: itsdangerous>=2.0 in c:\users\lenovo\anaconda\lib\site-packages (from Flask<3.1,>=1.0.4->dash) (2.0.1)
Requirement already satisfied: click>=8.0 in c:\users\lenovo\anaconda\lib\site-packages (from Flask<3.1,>=1.0.4->dash) (8.1.7)
Requirement already satisfied: six>=1.5 in c:\users\lenovo\anaconda\lib\site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)
Requirement already satisfied: MarkupSafe>=2.1.1 in c:\users\lenovo\anaconda\lib\site-packages (from Werkzeug<3.1->dash) (2.1.3)
Requirement already satisfied: zipp>=0.5 in c:\users\lenovo\anaconda\lib\site-packages (from importlib-metadata->dash) (3.17.0)
Requirement already satisfied: charset-normalizer<4,>=2 in c:\users\lenovo\anaconda\lib\site-packages (from requests->dash) (2.0.4)
Requirement already satisfied: idna<4,>=2.5 in c:\users\lenovo\anaconda\lib\site-packages (from requests->dash) (3.4)
Requirement already satisfied: urllib3<3,>=1.21.1 in c:\users\lenovo\anaconda\lib\site-packages (from requests->dash) (2.0.7)
Requirement already satisfied: certifi>=2017.4.17 in c:\users\lenovo\anaconda\lib\site-packages (from requests->dash) (2024.2.2)
Requirement already satisfied: colorama in c:\users\lenovo\anaconda\lib\site-packages (from click>=8.0->Flask<3.1,>=1.0.4->dash) (0.4.6)
Downloading dash-2.17.1-py3-none-any.whl (7.5 MB)
   ---------------------------------------- 0.0/7.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/7.5 MB 1.4 MB/s eta 0:00:06
   ---------------------------------------- 0.0/7.5 MB 653.6 kB/s eta 0:00:12
   ---------------------------------------- 0.1/7.5 MB 550.5 kB/s eta 0:00:14
   ---------------------------------------- 0.1/7.5 MB 550.5 kB/s eta 0:00:14
   ---------------------------------------- 0.1/7.5 MB 459.5 kB/s eta 0:00:17
   ---------------------------------------- 0.1/7.5 MB 403.5 kB/s eta 0:00:19
    --------------------------------------- 0.1/7.5 MB 425.1 kB/s eta 0:00:18
    --------------------------------------- 0.1/7.5 MB 448.2 kB/s eta 0:00:17
    --------------------------------------- 0.2/7.5 MB 393.8 kB/s eta 0:00:19
    --------------------------------------- 0.2/7.5 MB 393.8 kB/s eta 0:00:19
    --------------------------------------- 0.2/7.5 MB 338.5 kB/s eta 0:00:22
   - -------------------------------------- 0.3/7.5 MB 505.4 kB/s eta 0:00:15
   -- ------------------------------------- 0.4/7.5 MB 672.6 kB/s eta 0:00:11
   -- ------------------------------------- 0.5/7.5 MB 720.0 kB/s eta 0:00:10
   -- ------------------------------------- 0.5/7.5 MB 731.4 kB/s eta 0:00:10
   --- ------------------------------------ 0.6/7.5 MB 784.4 kB/s eta 0:00:09
   --- ------------------------------------ 0.7/7.5 MB 865.4 kB/s eta 0:00:08
   ---- ----------------------------------- 0.8/7.5 MB 927.5 kB/s eta 0:00:08
   ---- ----------------------------------- 0.8/7.5 MB 936.2 kB/s eta 0:00:08
   ---- ----------------------------------- 0.9/7.5 MB 921.8 kB/s eta 0:00:08
   ---- ----------------------------------- 0.9/7.5 MB 921.8 kB/s eta 0:00:08
   ---- ----------------------------------- 0.9/7.5 MB 901.1 kB/s eta 0:00:08
   ---- ----------------------------------- 0.9/7.5 MB 880.2 kB/s eta 0:00:08
   ---- ----------------------------------- 0.9/7.5 MB 880.2 kB/s eta 0:00:08
   ----- ---------------------------------- 1.0/7.5 MB 835.4 kB/s eta 0:00:08
   ----- ---------------------------------- 1.0/7.5 MB 856.2 kB/s eta 0:00:08
   ----- ---------------------------------- 1.0/7.5 MB 832.3 kB/s eta 0:00:08
   ----- ---------------------------------- 1.1/7.5 MB 852.0 kB/s eta 0:00:08
   ------ --------------------------------- 1.1/7.5 MB 876.4 kB/s eta 0:00:08
   ------ --------------------------------- 1.2/7.5 MB 902.2 kB/s eta 0:00:07
   ------ --------------------------------- 1.3/7.5 MB 920.9 kB/s eta 0:00:07
   ------- -------------------------------- 1.4/7.5 MB 979.4 kB/s eta 0:00:07
   ------- -------------------------------- 1.5/7.5 MB 972.9 kB/s eta 0:00:07
   -------- ------------------------------- 1.6/7.5 MB 1.0 MB/s eta 0:00:06
   -------- ------------------------------- 1.6/7.5 MB 1.0 MB/s eta 0:00:06
   --------- ------------------------------ 1.8/7.5 MB 1.1 MB/s eta 0:00:06
   --------- ------------------------------ 1.8/7.5 MB 1.1 MB/s eta 0:00:06
   ---------- ----------------------------- 1.9/7.5 MB 1.1 MB/s eta 0:00:05
   ---------- ----------------------------- 2.0/7.5 MB 1.2 MB/s eta 0:00:05
   ----------- ---------------------------- 2.2/7.5 MB 1.2 MB/s eta 0:00:05
   ----------- ---------------------------- 2.2/7.5 MB 1.2 MB/s eta 0:00:05
   ------------ --------------------------- 2.3/7.5 MB 1.2 MB/s eta 0:00:05
   ------------ --------------------------- 2.3/7.5 MB 1.2 MB/s eta 0:00:05
   ------------ --------------------------- 2.3/7.5 MB 1.2 MB/s eta 0:00:05
   ------------ --------------------------- 2.4/7.5 MB 1.2 MB/s eta 0:00:05
   ------------ --------------------------- 2.4/7.5 MB 1.2 MB/s eta 0:00:05
   ------------- -------------------------- 2.5/7.5 MB 1.2 MB/s eta 0:00:05
   ------------- -------------------------- 2.5/7.5 MB 1.2 MB/s eta 0:00:05
   ------------- -------------------------- 2.6/7.5 MB 1.2 MB/s eta 0:00:05
   -------------- ------------------------- 2.7/7.5 MB 1.2 MB/s eta 0:00:05
   -------------- ------------------------- 2.7/7.5 MB 1.2 MB/s eta 0:00:04
   -------------- ------------------------- 2.7/7.5 MB 1.2 MB/s eta 0:00:04
   --------------- ------------------------ 2.8/7.5 MB 1.2 MB/s eta 0:00:04
   --------------- ------------------------ 2.9/7.5 MB 1.2 MB/s eta 0:00:04
   --------------- ------------------------ 2.9/7.5 MB 1.2 MB/s eta 0:00:04
   --------------- ------------------------ 3.0/7.5 MB 1.2 MB/s eta 0:00:04
   ---------------- ----------------------- 3.0/7.5 MB 1.2 MB/s eta 0:00:04
   ---------------- ----------------------- 3.1/7.5 MB 1.2 MB/s eta 0:00:04
   ---------------- ----------------------- 3.2/7.5 MB 1.2 MB/s eta 0:00:04
   ----------------- ---------------------- 3.2/7.5 MB 1.2 MB/s eta 0:00:04
   ----------------- ---------------------- 3.3/7.5 MB 1.2 MB/s eta 0:00:04
   ----------------- ---------------------- 3.4/7.5 MB 1.2 MB/s eta 0:00:04
   ------------------ --------------------- 3.4/7.5 MB 1.3 MB/s eta 0:00:04
   ------------------ --------------------- 3.5/7.5 MB 1.3 MB/s eta 0:00:04
   ------------------- -------------------- 3.6/7.5 MB 1.3 MB/s eta 0:00:04
   -------------------- ------------------- 3.8/7.5 MB 1.3 MB/s eta 0:00:03
   -------------------- ------------------- 3.9/7.5 MB 1.3 MB/s eta 0:00:03
   -------------------- ------------------- 3.9/7.5 MB 1.3 MB/s eta 0:00:03
   --------------------- ------------------ 4.0/7.5 MB 1.3 MB/s eta 0:00:03
   --------------------- ------------------ 4.1/7.5 MB 1.3 MB/s eta 0:00:03
   ---------------------- ----------------- 4.2/7.5 MB 1.3 MB/s eta 0:00:03
   ---------------------- ----------------- 4.2/7.5 MB 1.3 MB/s eta 0:00:03
   ---------------------- ----------------- 4.3/7.5 MB 1.3 MB/s eta 0:00:03
   ----------------------- ---------------- 4.4/7.5 MB 1.4 MB/s eta 0:00:03
   ------------------------ --------------- 4.5/7.5 MB 1.4 MB/s eta 0:00:03
   ------------------------ --------------- 4.6/7.5 MB 1.4 MB/s eta 0:00:03
   ------------------------ --------------- 4.7/7.5 MB 1.4 MB/s eta 0:00:03
   ------------------------- -------------- 4.8/7.5 MB 1.4 MB/s eta 0:00:02
   -------------------------- ------------- 4.9/7.5 MB 1.4 MB/s eta 0:00:02
   -------------------------- ------------- 5.0/7.5 MB 1.4 MB/s eta 0:00:02
   --------------------------- ------------ 5.1/7.5 MB 1.4 MB/s eta 0:00:02
   --------------------------- ------------ 5.2/7.5 MB 1.4 MB/s eta 0:00:02
   --------------------------- ------------ 5.2/7.5 MB 1.4 MB/s eta 0:00:02
   ---------------------------- ----------- 5.3/7.5 MB 1.4 MB/s eta 0:00:02
   ---------------------------- ----------- 5.3/7.5 MB 1.4 MB/s eta 0:00:02
   ----------------------------- ---------- 5.4/7.5 MB 1.4 MB/s eta 0:00:02
   ----------------------------- ---------- 5.5/7.5 MB 1.4 MB/s eta 0:00:02
   ----------------------------- ---------- 5.6/7.5 MB 1.4 MB/s eta 0:00:02
   ----------------------------- ---------- 5.6/7.5 MB 1.4 MB/s eta 0:00:02
   ------------------------------ --------- 5.7/7.5 MB 1.4 MB/s eta 0:00:02
   ------------------------------ --------- 5.7/7.5 MB 1.4 MB/s eta 0:00:02
   ------------------------------ --------- 5.7/7.5 MB 1.4 MB/s eta 0:00:02
   ------------------------------ --------- 5.8/7.5 MB 1.4 MB/s eta 0:00:02
   ------------------------------- -------- 5.8/7.5 MB 1.4 MB/s eta 0:00:02
   ------------------------------- -------- 5.8/7.5 MB 1.4 MB/s eta 0:00:02
   ------------------------------- -------- 5.9/7.5 MB 1.4 MB/s eta 0:00:02
   -------------------------------- ------- 6.0/7.5 MB 1.4 MB/s eta 0:00:02
   -------------------------------- ------- 6.1/7.5 MB 1.4 MB/s eta 0:00:02
   -------------------------------- ------- 6.1/7.5 MB 1.4 MB/s eta 0:00:02
   --------------------------------- ------ 6.2/7.5 MB 1.4 MB/s eta 0:00:01
   --------------------------------- ------ 6.3/7.5 MB 1.4 MB/s eta 0:00:01
   --------------------------------- ------ 6.3/7.5 MB 1.4 MB/s eta 0:00:01
   --------------------------------- ------ 6.3/7.5 MB 1.4 MB/s eta 0:00:01
   ---------------------------------- ----- 6.4/7.5 MB 1.3 MB/s eta 0:00:01
   ---------------------------------- ----- 6.4/7.5 MB 1.3 MB/s eta 0:00:01
   ---------------------------------- ----- 6.5/7.5 MB 1.3 MB/s eta 0:00:01
   ---------------------------------- ----- 6.5/7.5 MB 1.3 MB/s eta 0:00:01
   ---------------------------------- ----- 6.5/7.5 MB 1.3 MB/s eta 0:00:01
   ----------------------------------- ---- 6.6/7.5 MB 1.3 MB/s eta 0:00:01
   ----------------------------------- ---- 6.7/7.5 MB 1.3 MB/s eta 0:00:01
   ------------------------------------ --- 6.8/7.5 MB 1.3 MB/s eta 0:00:01
   ------------------------------------ --- 6.9/7.5 MB 1.3 MB/s eta 0:00:01
   ------------------------------------ --- 6.9/7.5 MB 1.3 MB/s eta 0:00:01
   ------------------------------------- -- 7.0/7.5 MB 1.3 MB/s eta 0:00:01
   ------------------------------------- -- 7.1/7.5 MB 1.3 MB/s eta 0:00:01
   -------------------------------------- - 7.2/7.5 MB 1.3 MB/s eta 0:00:01
   -------------------------------------- - 7.2/7.5 MB 1.3 MB/s eta 0:00:01
   -------------------------------------- - 7.2/7.5 MB 1.3 MB/s eta 0:00:01
   ---------------------------------------  7.3/7.5 MB 1.3 MB/s eta 0:00:01
   ---------------------------------------  7.4/7.5 MB 1.3 MB/s eta 0:00:01
   ---------------------------------------  7.5/7.5 MB 1.3 MB/s eta 0:00:01
   ---------------------------------------  7.5/7.5 MB 1.3 MB/s eta 0:00:01
   ---------------------------------------- 7.5/7.5 MB 1.3 MB/s eta 0:00:00
Downloading dash_core_components-2.0.0-py3-none-any.whl (3.8 kB)
Downloading dash_html_components-2.0.0-py3-none-any.whl (4.1 kB)
Downloading dash_table-5.0.0-py3-none-any.whl (3.9 kB)
Downloading retrying-1.3.4-py3-none-any.whl (11 kB)
Installing collected packages: dash-table, dash-html-components, dash-core-components, retrying, dash
Successfully installed dash-2.17.1 dash-core-components-2.0.0 dash-html-components-2.0.0 dash-table-5.0.0 retrying-1.3.4
Note: you may need to restart the kernel to use updated packages.
In [23]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

# Load the data
df = pd.read_csv("C:\\Users\\LENOVO\\Desktop\\Summer-Olympic-medals-1976-to-2008.csv", encoding='latin1')

# Convert 'Year' column to an integer
#df['Year'] = df['Year'].astype(int)

# Aggregate data by country and year
medals_by_country_year = df.groupby(['Year', 'Country']).size().reset_index(name='Medal Count')

# Pivot the table for easier visualization
pivot_table = medals_by_country_year.pivot(index='Year', columns='Country', values='Medal Count').fillna(0)
pivot_table = pivot_table.reset_index()
In [24]:
# Line plot for total medals won by top 10 countries over the years
top_10_countries = df['Country'].value_counts().index[:10]
medals_by_year_top_10 = medals_by_country_year[medals_by_country_year['Country'].isin(top_10_countries)]

fig = px.line(medals_by_year_top_10, x='Year', y='Medal Count', color='Country', title='Total Medals Won by Top 10 Countries Over the Years')
fig.show()

# Bar plot for medals won by country in a specific year
year = 2008
medals_2008 = medals_by_country_year[medals_by_country_year['Year'] == year]
fig = px.bar(medals_2008, x='Country', y='Medal Count', title=f'Medals Won by Country in {year}')
fig.show()

# Heatmap for medals won by country and year
fig = px.imshow(pivot_table.set_index('Year'), title='Heatmap of Medals Won by Country and Year')
fig.show()
In [25]:
import dash
from dash import dcc, html
from dash.dependencies import Input, Output

# Initialize the Dash app
app = dash.Dash(__name__)

# Layout of the app
app.layout = html.Div([
    html.H1("Olympic Medals Dashboard"),
    dcc.Dropdown(
        id='country-dropdown',
        options=[{'label': country, 'value': country} for country in df['Country'].unique()],
        value='USA',
        multi=True
    ),
    dcc.Graph(id='medals-line-plot'),
    dcc.Slider(
        id='year-slider',
        min=df['Year'].min(),
        max=df['Year'].max(),
        value=df['Year'].max(),
        marks={str(year): str(year) for year in df['Year'].unique()},
        step=None
    ),
    dcc.Graph(id='medals-bar-plot')
])

# Callback for updating the line plot based on selected countries
@app.callback(
    Output('medals-line-plot', 'figure'),
    Input('country-dropdown', 'value')
)
def update_line_plot(selected_countries):
    if not selected_countries:
        return {}
    medals_by_year_selected = medals_by_country_year[medals_by_country_year['Country'].isin(selected_countries)]
    fig = px.line(medals_by_year_selected, x='Year', y='Medal Count', color='Country', title='Total Medals Won by Selected Countries Over the Years')
    return fig

# Callback for updating the bar plot based on selected year
@app.callback(
    Output('medals-bar-plot', 'figure'),
    Input('year-slider', 'value')
)
def update_bar_plot(selected_year):
    medals_selected_year = medals_by_country_year[medals_by_country_year['Year'] == selected_year]
    fig = px.bar(medals_selected_year, x='Country', y='Medal Count', title=f'Medals Won by Country in {selected_year}')
    return fig

# Run the app
if __name__ == '__main__':
    app.run_server(debug=True)
In [ ]: